library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(DT)
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 3.6.3
## == correlationfunnel Tip #1 ============================================================
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
Train<-read.csv("pml-training.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
Test<-read.csv("pml-testing.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dim(Train);
## [1] 19622 160
dim(Test)
## [1] 20 160
train_split <- createDataPartition(Train$classe, p = 0.8, list = F)
Val_data <- Train[-train_split,]
Train <- Train[train_split,]
dim(Train);
## [1] 15699 160
dim(Val_data)
## [1] 3923 160
table(Train$classe)/nrow(Train)
##
## A B C D E
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
mb <- sapply(select(Train,names(Train)[grepl("_belt",names(Train))]),function(x) sum(is.na(x)))
mb
## roll_belt pitch_belt yaw_belt
## 0 0 0
## total_accel_belt kurtosis_roll_belt kurtosis_picth_belt
## 0 15385 15407
## kurtosis_yaw_belt skewness_roll_belt skewness_roll_belt.1
## 15699 15385 15407
## skewness_yaw_belt max_roll_belt max_picth_belt
## 15699 15379 15379
## max_yaw_belt min_roll_belt min_pitch_belt
## 15385 15379 15379
## min_yaw_belt amplitude_roll_belt amplitude_pitch_belt
## 15385 15379 15379
## amplitude_yaw_belt var_total_accel_belt avg_roll_belt
## 15385 15379 15379
## stddev_roll_belt var_roll_belt avg_pitch_belt
## 15379 15379 15379
## stddev_pitch_belt var_pitch_belt avg_yaw_belt
## 15379 15379 15379
## stddev_yaw_belt var_yaw_belt gyros_belt_x
## 15379 15379 0
## gyros_belt_y gyros_belt_z accel_belt_x
## 0 0 0
## accel_belt_y accel_belt_z magnet_belt_x
## 0 0 0
## magnet_belt_y magnet_belt_z
## 0 0
ma <- sapply(select(Train,names(Train)[grepl("_arm",names(Train))]),function(x) sum(is.na(x)))
ma
## roll_arm pitch_arm yaw_arm total_accel_arm
## 0 0 0 0
## var_accel_arm avg_roll_arm stddev_roll_arm var_roll_arm
## 15379 15379 15379 15379
## avg_pitch_arm stddev_pitch_arm var_pitch_arm avg_yaw_arm
## 15379 15379 15379 15379
## stddev_yaw_arm var_yaw_arm gyros_arm_x gyros_arm_y
## 15379 15379 0 0
## gyros_arm_z accel_arm_x accel_arm_y accel_arm_z
## 0 0 0 0
## magnet_arm_x magnet_arm_y magnet_arm_z kurtosis_roll_arm
## 0 0 0 15438
## kurtosis_picth_arm kurtosis_yaw_arm skewness_roll_arm skewness_pitch_arm
## 15440 15386 15438 15440
## skewness_yaw_arm max_roll_arm max_picth_arm max_yaw_arm
## 15386 15379 15379 15379
## min_roll_arm min_pitch_arm min_yaw_arm amplitude_roll_arm
## 15379 15379 15379 15379
## amplitude_pitch_arm amplitude_yaw_arm
## 15379 15379
mf <- sapply(select(Train,names(Train)[grepl("_forearm",names(Train))]),function(x) sum(is.na(x)))
mf
## roll_forearm pitch_forearm yaw_forearm
## 0 0 0
## kurtosis_roll_forearm kurtosis_picth_forearm kurtosis_yaw_forearm
## 15443 15444 15699
## skewness_roll_forearm skewness_pitch_forearm skewness_yaw_forearm
## 15443 15444 15699
## max_roll_forearm max_picth_forearm max_yaw_forearm
## 15379 15379 15443
## min_roll_forearm min_pitch_forearm min_yaw_forearm
## 15379 15379 15443
## amplitude_roll_forearm amplitude_pitch_forearm amplitude_yaw_forearm
## 15379 15379 15443
## total_accel_forearm var_accel_forearm avg_roll_forearm
## 0 15379 15379
## stddev_roll_forearm var_roll_forearm avg_pitch_forearm
## 15379 15379 15379
## stddev_pitch_forearm var_pitch_forearm avg_yaw_forearm
## 15379 15379 15379
## stddev_yaw_forearm var_yaw_forearm gyros_forearm_x
## 15379 15379 0
## gyros_forearm_y gyros_forearm_z accel_forearm_x
## 0 0 0
## accel_forearm_y accel_forearm_z magnet_forearm_x
## 0 0 0
## magnet_forearm_y magnet_forearm_z
## 0 0
md <- sapply(select(Train,names(Train)[grepl("_dumbbell",names(Train))]),function(x) sum(is.na(x)))
md
## roll_dumbbell pitch_dumbbell yaw_dumbbell
## 0 0 0
## kurtosis_roll_dumbbell kurtosis_picth_dumbbell kurtosis_yaw_dumbbell
## 15382 15379 15699
## skewness_roll_dumbbell skewness_pitch_dumbbell skewness_yaw_dumbbell
## 15382 15379 15699
## max_roll_dumbbell max_picth_dumbbell max_yaw_dumbbell
## 15379 15379 15382
## min_roll_dumbbell min_pitch_dumbbell min_yaw_dumbbell
## 15379 15379 15382
## amplitude_roll_dumbbell amplitude_pitch_dumbbell amplitude_yaw_dumbbell
## 15379 15379 15382
## total_accel_dumbbell var_accel_dumbbell avg_roll_dumbbell
## 0 15379 15379
## stddev_roll_dumbbell var_roll_dumbbell avg_pitch_dumbbell
## 15379 15379 15379
## stddev_pitch_dumbbell var_pitch_dumbbell avg_yaw_dumbbell
## 15379 15379 15379
## stddev_yaw_dumbbell var_yaw_dumbbell gyros_dumbbell_x
## 15379 15379 0
## gyros_dumbbell_y gyros_dumbbell_z accel_dumbbell_x
## 0 0 0
## accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x
## 0 0 0
## magnet_dumbbell_y magnet_dumbbell_z
## 0 0
dc <- c(names(mb[mb!=0]), names(ma[ma!=0]),names(mf[mf!=0]),names(md[md!=0]))
length(dc)
## [1] 100
data_an<-tbl_df(Train%>%select(-dc,-c(X,user_name,raw_timestamp_part_1,raw_timestamp_part_2,cvtd_timestamp,new_window,num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(dc)` instead of `dc` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data_an$classe<-as.factor(data_an$classe)
data_an[,1:52]<-lapply(data_an[,1:52],as.numeric)
dim(data_an)
## [1] 15699 53
c_c <- cor(select(data_an, -classe))
diag(c_c) <- 0
c_c <- which(abs(c_c)>0.8,arr.ind = T)
c_c <- unique(row.names(c_c))
corrplot(cor(select(data_an,c_c)),type="upper",order="hclust",method="number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(c_c)` instead of `c_c` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

cfd <-data_an%>%binarize(n_bins=4,thresh_infreq=0.01)
ca <- cfd %>% correlate(target=classe__A)
cb<-cfd%>%correlate(target=classe__B)
cc <- cfd%>%correlate(target=classe__C)
cd<-cfd%>%correlate(target=classe__D)
ce<-cfd %>% correlate(target = classe__E)
a_col <- c("magnet_arm_x", "pitch_forearm" , "magnet_dumbbell_y", "roll_forearm", "gyros_dumbbell_y")
b_col <- c("magnet_dumbbell_y", "magnet_dumbbell_x" , "roll_dumbbell" ,
"magnet_belt_y" , "accel_dumbbell_x" )
c_col <- c("magnet_dumbbell_y", "roll_dumbbell" , "accel_dumbbell_y" ,
"magnet_dumbbell_x", "magnet_dumbbell_z")
d_col <- c("pitch_forearm" , "magnet_arm_y" , "magnet_forearm_x",
"accel_dumbbell_y", "accel_forearm_x")
e_col <- c("magnet_belt_y" , "magnet_belt_z" , "roll_belt",
"gyros_belt_z" , "magnet_dumbbell_y")
fc <- character()
for(c in c(a_col,b_col,c_col,d_col,e_col)){
fc <- union(fc,c)
}
data_an2 <- data_an%>%select(fc,classe)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(fc)` instead of `fc` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data.frame("arm" = sum(grepl("_arm",fc)),"forearm"=sum(grepl("_forearm",fc)),"belt"=sum(grepl("_belt",fc)),"dumbbell"=sum(grepl("_dumbbell",fc)))
## arm forearm belt dumbbell
## 1 2 4 4 7
m_d<-function(data, mapping, ...) {
ggplot(data = data, mapping=mapping)+geom_density(..., alpha = 0.3)+scale_fill_brewer(palette="Set2")
}
m_p<-function(data, mapping, ...) {
ggplot(data = data, mapping=mapping)+geom_point(..., alpha = 0.1)+ scale_fill_brewer(palette="Set2")
}
ggpairs(data_an2,columns = 1:5,aes(color = classe),lower = list(continuous = m_p),diag = list(continuous = m_d))

ggpairs(data_an2,columns=6:10,aes(color=classe),lower=list(continuous=m_p),diag =list(continuous=m_d))

ggpairs(data_an2,columns = 11:17,aes(color = classe),lower=list(continuous=m_p),diag=list(continuous=m_d))

TrainF <- Train %>% select(fc,classe)
Val_dataF<-Val_data %>% select(fc,classe)
TrainF[,1:17] <- sapply(TrainF[,1:17],as.numeric)
Val_dataF[,1:17] <- sapply(Val_dataF[,1:17],as.numeric)
lvl<-c("A", "B", "C", "D", "E")
ppo <- preProcess(TrainF[,-18],method = c("center","scale","BoxCox"))
Tr_x <- predict(ppo,select(TrainF,-classe))
Tr_y <- factor(TrainF$classe,levels=lvl)
V_x <- predict(ppo,select(Val_dataF,-classe))
V_y<- factor(Val_dataF$classe,levels=lvl)
C_tr <- trainControl(method="cv", number=5)
CT_m <- train(x = Tr_x,y = Tr_y,method = "rpart", trControl = C_tr)
RF_m <- train(x = Tr_x,y = Tr_y, method = "rf", trControl = C_tr,verbose=FALSE, metric = "Accuracy")
GBM_m <- train(x = Tr_x,y = Tr_y,method = "gbm",trControl=C_tr,verbose=FALSE)
SVM_m <- svm(x = Tr_x,y = Tr_y,kernel = "polynomial", cost = 10)
confusionMatrix(predict(CT_m,V_x),V_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1011 327 321 302 106
## B 14 263 29 110 94
## C 89 169 334 231 212
## D 0 0 0 0 0
## E 2 0 0 0 309
##
## Overall Statistics
##
## Accuracy : 0.4887
## 95% CI : (0.4729, 0.5044)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.331
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9059 0.34651 0.48830 0.0000 0.42857
## Specificity 0.6238 0.92193 0.78358 1.0000 0.99938
## Pos Pred Value 0.4891 0.51569 0.32271 NaN 0.99357
## Neg Pred Value 0.9434 0.85467 0.87881 0.8361 0.88594
## Prevalence 0.2845 0.19347 0.17436 0.1639 0.18379
## Detection Rate 0.2577 0.06704 0.08514 0.0000 0.07877
## Detection Prevalence 0.5269 0.13000 0.26383 0.0000 0.07928
## Balanced Accuracy 0.7649 0.63422 0.63594 0.5000 0.71397
confusionMatrix(predict(RF_m,V_x),V_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1114 9 0 6 0
## B 0 739 6 1 1
## C 1 9 672 9 1
## D 0 2 6 627 2
## E 1 0 0 0 717
##
## Overall Statistics
##
## Accuracy : 0.9862
## 95% CI : (0.9821, 0.9896)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9826
##
## Mcnemar's Test P-Value : 0.01261
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9982 0.9736 0.9825 0.9751 0.9945
## Specificity 0.9947 0.9975 0.9938 0.9970 0.9997
## Pos Pred Value 0.9867 0.9893 0.9711 0.9843 0.9986
## Neg Pred Value 0.9993 0.9937 0.9963 0.9951 0.9988
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2840 0.1884 0.1713 0.1598 0.1828
## Detection Prevalence 0.2878 0.1904 0.1764 0.1624 0.1830
## Balanced Accuracy 0.9964 0.9856 0.9881 0.9860 0.9971
plot(RF_m$finalModel,main="Error VS no of tree")

confusionMatrix(predict(GBM_m,V_x),V_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1100 38 5 7 3
## B 8 628 42 8 11
## C 3 59 614 39 11
## D 3 31 21 586 13
## E 2 3 2 3 683
##
## Overall Statistics
##
## Accuracy : 0.9205
## 95% CI : (0.9116, 0.9287)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8993
##
## Mcnemar's Test P-Value : 2.621e-09
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9857 0.8274 0.8977 0.9114 0.9473
## Specificity 0.9811 0.9782 0.9654 0.9793 0.9969
## Pos Pred Value 0.9540 0.9010 0.8457 0.8960 0.9856
## Neg Pred Value 0.9942 0.9594 0.9781 0.9826 0.9882
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2804 0.1601 0.1565 0.1494 0.1741
## Detection Prevalence 0.2939 0.1777 0.1851 0.1667 0.1767
## Balanced Accuracy 0.9834 0.9028 0.9315 0.9453 0.9721
confusionMatrix(predict(SVM_m,V_x),V_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1111 57 26 20 3
## B 1 653 25 5 3
## C 2 40 623 41 5
## D 2 5 7 574 10
## E 0 4 3 3 700
##
## Overall Statistics
##
## Accuracy : 0.9332
## 95% CI : (0.9249, 0.9408)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9152
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9955 0.8603 0.9108 0.8927 0.9709
## Specificity 0.9622 0.9893 0.9728 0.9927 0.9969
## Pos Pred Value 0.9129 0.9505 0.8762 0.9599 0.9859
## Neg Pred Value 0.9982 0.9672 0.9810 0.9792 0.9935
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2832 0.1665 0.1588 0.1463 0.1784
## Detection Prevalence 0.3102 0.1751 0.1812 0.1524 0.1810
## Balanced Accuracy 0.9789 0.9248 0.9418 0.9427 0.9839
Test2 <- Test %>% select(fc,problem_id)
xTest <- Test2 %>% select(fc)
result <- data.frame("problem_id" = Test$problem_id,"PREDICTION_RF"=predict(RF_m,xTest),"PREDICTION_GBM"=predict(GBM_m,xTest),"PREDICTION_SVM"=predict(SVM_m,xTest))
result
## problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1 1 E E C
## 2 2 A E A
## 3 3 A D B
## 4 4 E E C
## 5 5 A A A
## 6 6 E D A
## 7 7 E E A
## 8 8 B D A
## 9 9 A A B
## 10 10 E E E
## 11 11 A E B
## 12 12 A D A
## 13 13 B D E
## 14 14 A D B
## 15 15 E E A
## 16 16 E E A
## 17 17 E E C
## 18 18 B E A
## 19 19 E E A
## 20 20 E E E